In [1]:
from sklearn.svm import SVC
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot as plt
from pandas.tools.plotting import scatter_matrix
%matplotlib inline
In [5]:
# Load the weather/outage dataset; path is relative to this notebook's location.
data = pd.read_csv("../../Data/WeatherOutagesAll_RK.csv")
In [8]:
# Preview the first rows to sanity-check the load (columns, dtypes, values).
data.head()
Out[8]:
In [7]:
# Discretize Total_outages into three ordinal classes:
#   0 = calm day    (fewer than 3 outages)
#   2 = extreme day (more than 7 outages)
#   1 = bad day     (everything in between)
# np.select replaces the original row-by-row .iloc/.loc loop with one
# vectorized pass; the labels produced are identical.  (NOTE: the loop
# version created a float64 column because .loc filled it row-by-row;
# np.select yields integer labels, which is what the int(...) casts in
# the original were clearly aiming for.)
data["outage_class"] = np.select(
    [data["Total_outages"] < 3, data["Total_outages"] > 7],
    [0, 2],
    default=1,
)
In [34]:
# StandardScaler: z-score normalization; fit on training folds in the cells below.
scaler = StandardScaler()
In [35]:
# Coarse grid search over C and gamma for an RBF SVC.
# BUG FIX: the original referenced an undefined name `splitter` (NameError on a
# fresh kernel).  Define the train/test splitter explicitly here so the cell
# survives Restart-&-Run-All.
splitter = ShuffleSplit(n_splits=1, test_size=0.2)

xTrain = None
yTrain = None
xTest = None
yTest = None
df = None
for train, test in splitter.split(data):
    # Column 0 is a non-feature (identifier/date) column; last column is the label.
    xTrain = scaler.fit_transform(data.iloc[train, 1:-1])
    yTrain = data.iloc[train, -1]
    # Scale the held-out split with training-fold statistics only (no leakage).
    xTest = scaler.transform(data.iloc[test, 1:-1])
    yTest = data.iloc[test, -1]
    cRange = np.logspace(-3, 3, 7)
    gammaRange = np.logspace(-3, 3, 7)
    paramGrid = dict(gamma=gammaRange, C=cRange)
    grid = GridSearchCV(SVC(cache_size=1000.0, verbose=1, class_weight="balanced"),
                        param_grid=paramGrid, cv=KFold(n_splits=5, shuffle=True),
                        verbose=True)
    grid.fit(xTrain, yTrain)
    print("The best parameters are %s with a score of %0.2f"
          % (grid.best_params_, grid.best_score_))
    df = pd.DataFrame(grid.cv_results_)
# Persist the CV results of the (single) split for later inspection.
df.to_csv("resultsRBF1.csv")
We need an evaluation metric that specifically rewards correct prediction of classes 1 (bad) and 2 (extreme) — overall accuracy is dominated by the majority class 0 (calm days).
In [65]:
# Custom grid search over C and gamma using task-specific error metrics:
# per-class false-negative rates for "bad" (1) and "extreme" (2) days matter
# more here than overall accuracy, which is dominated by calm days.
xTrain = None
yTrain = None
xTest = None
yTest = None
nSplits = 10
shuffleSplitter = ShuffleSplit(n_splits=nSplits, test_size=0.2)
classifier = SVC(C=100, gamma=0.1, cache_size=1000.0, class_weight="balanced", probability=True)
records = []  # one summary dict per (C, gamma); assembled into a DataFrame at the end
for cValue in np.logspace(-5, 5, 11):
    for gammaValue in np.logspace(-5, 5, 11):
        classifier.set_params(C=cValue, gamma=gammaValue)
        allCorrectness = []
        allFalseNegativesBad = []
        allFalseNegativesExtreme = []
        allFalsePositives = []
        allBadDays = []
        allExtremeDays = []
        for train, test in shuffleSplitter.split(data):
            overallCorrectness = 0
            falseNegativesBad = 0
            falseNegativesExtreme = 0
            falsePositives = 0
            badDays = 0
            extremeDays = 0
            xTrain = scaler.fit_transform(data.iloc[train, 1:-1])
            yTrain = data.iloc[train, -1]
            xTest = scaler.transform(data.iloc[test, 1:-1])
            yTest = data.iloc[test, -1]
            classifier.fit(xTrain, yTrain)
            yPredict = classifier.predict(xTest)
            yTest = yTest.tolist()
            yPredict = yPredict.tolist()
            for i in range(len(yTest)):
                if yTest[i] == 1:
                    badDays += 1
                elif yTest[i] == 2:
                    extremeDays += 1
                if yTest[i] == yPredict[i]:
                    overallCorrectness += 1
                elif yTest[i] < yPredict[i]:
                    # Predicted a worse class than reality (over-warning).
                    falsePositives += 1
                elif yTest[i] == 1 and yPredict[i] == 0:
                    falseNegativesBad += 1
                else:  # yTest[i] == 2 and yPredict[i] < 2
                    falseNegativesExtreme += 1
            allCorrectness.append(overallCorrectness / len(yTest))
            allBadDays.append(badDays)
            allExtremeDays.append(extremeDays)
            # Guard against splits with no bad/extreme days to avoid ZeroDivisionError.
            if badDays != 0:
                allFalseNegativesBad.append(falseNegativesBad / badDays)
                allFalsePositives.append(falsePositives / (badDays + extremeDays))
            else:
                allFalseNegativesBad.append(0)
                allFalsePositives.append(0)
            if extremeDays != 0:
                allFalseNegativesExtreme.append(falseNegativesExtreme / extremeDays)
            else:
                allFalseNegativesExtreme.append(0)
        # BUG FIX: the original read `allbadDays` here (NameError — the list is
        # `allBadDays`).  Also: collect rows in a plain list instead of the
        # removed (and quadratic) DataFrame.append-in-a-loop pattern.
        records.append({"C": cValue, "gamma": gammaValue,
                        "Overall_Correctness": np.mean(allCorrectness),
                        "False_Negatives_Extreme": np.mean(allFalseNegativesExtreme),
                        "False_Negatives_Bad": np.mean(allFalseNegativesBad),
                        "False_Positives": np.mean(allFalsePositives),
                        "Bad_Days": np.mean(allBadDays),
                        "Extreme_Days": np.mean(allExtremeDays)})
df = pd.DataFrame(records)
df.to_csv("gridSearch2.csv")
Out[65]:
In [56]:
# Fit the final model on the full dataset with the hyper-parameters chosen above.
scaler = StandardScaler()
classifier = SVC(C=10, gamma=0.01, class_weight="balanced", probability=True)

# Features are every column except the leading identifier and trailing label.
xTrain = scaler.fit_transform(data.iloc[:, 1:-1])
yTrain = data.iloc[:, -1]
classifier.fit(xTrain, yTrain)
Out[56]:
In [82]:
# Persist the fitted classifier and scaler for reuse in later cells / scripts.
# FIX: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed
# in 0.23 — import the joblib package directly.
import joblib
joblib.dump(classifier, 'SVCmodel.pkl')
joblib.dump(scaler, 'scaler.pkl')
Out[82]:
In [12]:
# Reload the persisted model/scaler and smoke-test a single prediction.
# FIX: import joblib directly (sklearn.externals.joblib removed in sklearn 0.23).
import joblib
classifier = joblib.load("SVCmodel.pkl")
scaler = joblib.load("scaler.pkl")
# One sample with the 7 weather features, already shaped (1, 7) for sklearn
# (the original's extra reshape(1, -1) was a no-op and is dropped).
bar = np.array([[12, 70, 80, 8, 5, 10, 0.]])
bar2 = scaler.transform(bar)
print(classifier.predict(bar2))
classifier.predict_proba(bar2)
Out[12]:
In [13]:
def predictOutage(weatherData):
    """Predict the outage class for raw weather samples.

    Parameters
    ----------
    weatherData : array-like of shape (n_samples, 7)
        Raw (unscaled) weather features; column order must match the
        training data (see documentation).

    Returns
    -------
    ndarray of shape (n_samples,)
        Predicted class labels (0 = calm, 1 = bad, 2 = extreme).

    Raises
    ------
    ValueError
        If weatherData does not have exactly 7 feature columns.
    """
    import joblib  # local import: sklearn.externals.joblib is gone in sklearn >= 0.23
    if weatherData.shape[1] != 7:
        raise ValueError("7 features are required. See documentation.")
    model = joblib.load("SVCmodel.pkl")
    scaler = joblib.load("scaler.pkl")
    scaledData = scaler.transform(weatherData)
    # BUG FIX: the original computed scaledData but then passed the *unscaled*
    # weatherData to the model, and called predict_proba despite the function's
    # name (predictOutageProba below covers probabilities).  Predict class
    # labels from the scaled features.
    return model.predict(scaledData)
In [7]:
# FIX: import joblib directly (sklearn.externals.joblib removed in sklearn 0.23).
import joblib

bar = np.array([[12, 70, 80, 8, 5, 10, 0.]])

def predictOutageProba(weatherData):
    """Return class-membership probabilities for raw weather samples.

    Parameters
    ----------
    weatherData : array-like of shape (n_samples, 7)
        Raw (unscaled) weather features; column order must match the
        training data (see documentation).

    Returns
    -------
    ndarray of shape (n_samples, n_classes)
        Per-class probabilities from the persisted SVC model.

    Raises
    ------
    ValueError
        If weatherData does not have exactly 7 feature columns.
    """
    if weatherData.shape[1] != 7:
        raise ValueError("7 features are required. See documentation.")
    model = joblib.load("SVCmodel.pkl")
    scaler = joblib.load("scaler.pkl")
    scaledData = scaler.transform(weatherData)
    return model.predict_proba(scaledData)

predictOutageProba(bar)
Out[7]:
Plot the distribution of daily outage totals to visualize the class skew.
In [101]:
# Distribution of daily outage counts.  Title and axis labels added so the
# figure stands alone when the notebook is skimmed.
plt.hist(data.Total_outages, bins=50)
plt.title("Distribution of daily outage counts")
plt.xlabel("Total outages per day")
plt.ylabel("Number of days")
plt.show()
In [ ]: